library(limma)
suppressPackageStartupMessages(library(plotly))
library(Rtsne)
library(DT)
library(ggrepel)
library(gridExtra)
library(ggpubr)
library(data.table)
library(tidyr)
library(dplyr)
library(tidyverse)
library(edgeR)baseFolder <- "/Users/yzha0247/Dropbox (Sydney Uni)/mousediet/heartMouse"
sampleInfo <- read.csv(file.path(baseFolder, "mouseDietInfo.csv"))
countsGenes <- read.delim(file.path(baseFolder, "unnormalisedGeneCounts.txt"))
#the expression value
countsGenes[1:3,1:3]## GC112 GC116 GC191
## Gnai3 2151 1396 1057
## Pbsn 0 0 0
## Cdc45 1422 560 581
## [1] 53546 36
sampleInfo <- sampleInfo[match(colnames(countsGenes), sampleInfo[, "Sample"]), ]
sampleInfo[, "Diet"] <- factor(sampleInfo[, "Diet"], levels = c("Chow", "Glucose", "Sucrose"))
#the group info
table(sampleInfo$Diet)##
## Chow Glucose Sucrose
## 16 13 7
## Sample Diet
## 28 GC112 Glucose
## 29 GC116 Glucose
## 30 GC191 Glucose
## 31 GC195 Glucose
## 32 GC196 Glucose
## 33 GC197 Glucose
## 34 GC201 Glucose
## 35 GC202 Glucose
## 36 GC204 Glucose
## 37 GC206 Glucose
## 18 GC222 Sucrose
## 19 GC226 Sucrose
## 20 GC229 Sucrose
## 38 SL11 Glucose
## 23 SL114 Sucrose
## 1 SL142 Chow
## 2 SL145 Chow
## 39 SL15 Glucose
## 40 SL19 Glucose
## 8 SL221 Chow
## 9 SL223 Chow
## 10 SL225 Chow
## 12 SL227 Chow
## 7 SL2310 Chow
## 3 SL233 Chow
## 4 SL237 Chow
## 5 SL238 Chow
## 6 SL239 Chow
## 13 SL241 Chow
## 14 SL242 Chow
## 15 SL243 Chow
## 16 SL244 Chow
## 17 SL245 Chow
## 21 SL32 Sucrose
## 26 SL43 Sucrose
## 27 SL44 Sucrose
##
## FALSE
## 1927656
#well, if use voom and also account for the gene length, no need to log2(x+1) transform the data
#but i think, we still can log and then voom
# https://www.bioconductor.org/packages/devel/workflows/vignettes/RNAseq123/inst/doc/limmaWorkflow.html
#expressiondt=log2(countsGenes+1)
#boxplot(expressiondt)
# account for the gene length, using the counts per million, cpm function
expressiondt=cpm(countsGenes)
# check 0 counts
table(rowSums(expressiondt==0)==36) #22468 genes have all mice value 0 ##
## FALSE TRUE
## 31078 22468
expressionkeep <- names(which(rowSums(expressiondt==0)<18))
expressiondt <- expressiondt[rownames(expressiondt)%in%expressionkeep,] #keep rows(genes) half of mice not 0
dim(expressiondt)## [1] 20872 36
# scale the data before pca
pcadt1=scale(expressiondt)
pca=prcomp(t(pcadt1)) # have to use this function, original function doesnt work
df_toplot <- data.frame(sampleInfo$Diet, pc1 = pca$x[,1], pc2 = pca$x[,2] )
colnames(df_toplot)[1]="group"
g <- ggplot(df_toplot, aes(x = pc1, y = pc2, color = group)) +
geom_point() + ggtitle("pca plot")
gdgel <- DGEList(counts=expressiondt, group=factor(sampleInfo$Diet))
dgel2 <- calcNormFactors(dgel)
groupname<-as.factor(sampleInfo$Diet)
design <- model.matrix(~ groupname + 0)
y <- voom(dgel2, design)
fit <- lmFit(y, design)
cont.matrix <- makeContrasts(contrasts = c("groupnameGlucose-groupnameChow", "groupnameSucrose-groupnameChow","groupnameSucrose-groupnameGlucose"),levels=design)
fit2 <- contrasts.fit(fit, cont.matrix)
fit2 <- eBayes(fit2)
limma.res=topTable(fit2,n=Inf,sort="p", adjust.method = "BH",coef = 1)
datatable(format(limma.res,digits=3),options = list(
searching = TRUE, rownames= FALSE,
pageLength = 5,
lengthMenu = c(5, 10, 15, 20),
width=8),caption = "full table")datatable(format(limma.res,digits=3)[limma.res$adj.P.Val<0.05,],options = list(
searching = TRUE, rownames= FALSE,
pageLength = 5,
lengthMenu = c(5, 10, 15, 20),
width=8),caption = "adjustedpval<0.05")# #filter average expression>7
# limma.res=limma.res[limma.res$AveExpr>7,]
# datatable(limma.res,options = list(
# searching = TRUE, rownames= FALSE,
# pageLength = 5,
# lengthMenu = c(5, 10, 15, 20),
# width=8))
dt=limma.res
dt$name=rownames(dt)
p1 <- ggplot(dt, aes(x = AveExpr, y = logFC,label=name))+
geom_point(aes(colour=-log10(P.Value)), alpha=1/3, size=1) +
scale_colour_gradient(low="blue",high="red")+
ylab("log2 fold change") + xlab("AveExpr")+ggtitle("GlucoseVsChow")
p2 <- ggplot(dt, aes(logFC,-log10(P.Value),label=name))+
geom_point(aes(colour=-log10(P.Value)), alpha=1/3, size=1) +
scale_colour_gradient(low="blue",high="red")+
xlab("log2 fold change") + ylab("-log10 p-value")+ggtitle("GlucoseVsChow")
ggarrange(p1,p2, ncol=2, nrow=1)suppressPackageStartupMessages(library(org.Mm.eg.db))
limma.res$entrez = mapIds(org.Mm.eg.db,
keys=row.names(limma.res),
column="ENTREZID",
keytype="SYMBOL",
multiVals="first")
fc <- limma.res$logFC
names(fc) <- as.numeric(limma.res$entrez)
# require(gage)
# require(gageData)
# kegg = kegg.gsets(species = "mmu", id.type = "kegg")
# kegg.sets.hsa = kegg$kg.sets
# fc.kegg.p <- gage(fc, gsets = kegg.sets.hsa, ref = NULL, samp = NULL, rank.test = TRUE)
require(gage)
library("reactome.db")
xx <- as.list(reactomePATHID2EXTID)
#keys <- c("100034361", "100037258")
#select(reactome.db, limma.res$entrez, c("PATHNAME"), 'ENTREZID')
fc.kegg.p <- gage(fc, gsets = xx, ref = NULL, samp = NULL, rank.test = TRUE)
updt=as.data.frame(fc.kegg.p[[1]])[,-c(1,6)]
colnames(updt)=c("enrichment score","p","FDR","set.size")
updt$DB_ID=rownames(updt)
yy <- as.data.frame(reactomePATHNAME2ID)
updt=dplyr::left_join(updt,yy,by="DB_ID")
updt=na.omit(updt[!duplicated(updt$path_name),])
rownames(updt)=updt$path_name
datatable(format(updt,digits=3),options = list(
searching = TRUE, rownames= FALSE,
pageLength = 5,
lengthMenu = c(5, 10, 15, 20),
width=8
))downdt=as.data.frame(fc.kegg.p[[2]])[,-c(1,6)]
colnames(downdt)=c("enrichment score","p","FDR","set.size")
downdt$DB_ID=rownames(downdt)
yy <- as.data.frame(reactomePATHNAME2ID)
downdt=dplyr::left_join(downdt,yy,by="DB_ID")
downdt=na.omit(downdt[!duplicated(downdt$path_name),])
rownames(downdt)=downdt$path_name
datatable(format(downdt,digits=3),options = list(
searching = TRUE, rownames= FALSE,
pageLength = 5,
lengthMenu = c(5, 10, 15, 20),
width=8
))both = rbind(downdt, updt)
selectionn=both%>%
filter(p < 0.05,set.size>200,FDR<0.05)
colnames(selectionn)[1]="stat.mean"
datatable(format(selectionn,digits=3),options = list(
searching = TRUE, rownames= FALSE,
pageLength = 5,
lengthMenu = c(5, 10, 15, 20),
width=8
),caption = "restricted table to p < 0.05,set.size>200,FDR<0.05")library(ggrepel )
ggplot(selectionn, aes(x = stat.mean, y = p)) + geom_point( aes(color = set.size), size = 3) + labs(x = "Enrichment score", y = "p", title = "GSEA for significantly downregulated and upregulated pathways") +
geom_text_repel(aes(label=path_name), size=3) +
scale_colour_gradient(high = "red", low = "blue") +
scale_x_continuous(
labels = scales::number_format(accuracy = 0.01))########################################################################################
# this .gmt thing not work because the names are not the same, mouse entrez and symbols are not the same as human
# #name matching
# library(UniProt.ws)
# help('availableUniprotSpecies')
# lookupUniprotSpeciesFromTaxId(10090)
# database <- UniProt.ws(taxId = 10090)
# entrezz = UniProt.ws::select(database,
# keys=rownames(limma.res),
# columns="ENTREZ_GENE",
# keytype="UNIPROTKB_ID")
# saveRDS(entrezz,"entrezid.rds")
# entrezz=readRDS("entrezid.rds")
# entrezz2 <- entrezz[!duplicated(entrezz$UNIPROTKB_ID ),]
# limma.res$entrez=as.numeric(entrezz2[,2])
#library(qusage)
#filee=read.gmt("/Users/yzha0247/Dropbox (Sydney Uni)/mousediet/c2.cp.reactome.v7.4.entrez.gmt")
#filee=read.gmt("/Users/yzha0247/Dropbox (Sydney Uni)/mousediet/c2.cp.reactome.v7.4.symbols.gmt")
#fc <- limma.res$logFC
#names(fc) <- rownames(limma.res)
#require(gage)
#fc.kegg.p <- gage(fc, gsets = filee, ref = NULL, samp = NULL, rank.test = TRUE)
########################################################################################dgel <- DGEList(counts=expressiondt, group=factor(sampleInfo$Diet))
dgel2 <- calcNormFactors(dgel)
groupname<-as.factor(sampleInfo$Diet)
design <- model.matrix(~ groupname + 0)
y <- voom(dgel2, design)
fit <- lmFit(y, design)
cont.matrix <- makeContrasts(contrasts = c("groupnameGlucose-groupnameChow", "groupnameSucrose-groupnameChow","groupnameSucrose-groupnameGlucose"),levels=design)
fit2 <- contrasts.fit(fit, cont.matrix)
fit2 <- eBayes(fit2)
limma.res=topTable(fit2,n=Inf,sort="p", adjust.method = "BH",coef = 2)
datatable(format(limma.res,digits=3),options = list(
searching = TRUE, rownames= FALSE,
pageLength = 5,
lengthMenu = c(5, 10, 15, 20),
width=8),caption = "full table")datatable(format(limma.res,digits=3)[limma.res$adj.P.Val<0.05,],options = list(
searching = TRUE, rownames= FALSE,
pageLength = 5,
lengthMenu = c(5, 10, 15, 20),
width=8),caption = "adjustedpval<0.05")# #filter average expression>7
# limma.res=limma.res[limma.res$AveExpr>7,]
# datatable(limma.res,options = list(
# searching = TRUE, rownames= FALSE,
# pageLength = 5,
# lengthMenu = c(5, 10, 15, 20),
# width=8))
dt=limma.res
dt$name=rownames(dt)
p1 <- ggplot(dt, aes(x = AveExpr, y = logFC,label=name))+
geom_point(aes(colour=-log10(P.Value)), alpha=1/3, size=1) +
scale_colour_gradient(low="blue",high="red")+
ylab("log2 fold change") + xlab("AveExpr")+ggtitle("GlucoseVsChow")
p2 <- ggplot(dt, aes(logFC,-log10(P.Value),label=name))+
geom_point(aes(colour=-log10(P.Value)), alpha=1/3, size=1) +
scale_colour_gradient(low="blue",high="red")+
xlab("log2 fold change") + ylab("-log10 p-value")+ggtitle("GlucoseVsChow")
ggarrange(p1,p2, ncol=2, nrow=1)suppressPackageStartupMessages(library(org.Mm.eg.db))
limma.res$entrez = mapIds(org.Mm.eg.db,
keys=row.names(limma.res),
column="ENTREZID",
keytype="SYMBOL",
multiVals="first")
fc <- limma.res$logFC
names(fc) <- limma.res$entrez
require(gage)
library("reactome.db")
xx <- as.list(reactomePATHID2EXTID)
#keys <- c("100034361", "100037258")
#select(reactome.db, limma.res$entrez, c("PATHNAME"), 'ENTREZID')
fc.kegg.p <- gage(fc, gsets = xx, ref = NULL, samp = NULL, rank.test = TRUE)
updt=as.data.frame(fc.kegg.p[[1]])[,-c(1,6)]
colnames(updt)=c("enrichment score","p","FDR","set.size")
updt$DB_ID=rownames(updt)
yy <- as.data.frame(reactomePATHNAME2ID)
updt=dplyr::left_join(updt,yy,by="DB_ID")
updt=na.omit(updt[!duplicated(updt$path_name),])
rownames(updt)=updt$path_name
datatable(format(updt,digits=3),options = list(
searching = TRUE, rownames= FALSE,
pageLength = 5,
lengthMenu = c(5, 10, 15, 20),
width=8
))downdt=as.data.frame(fc.kegg.p[[2]])[,-c(1,6)]
colnames(downdt)=c("enrichment score","p","FDR","set.size")
downdt$DB_ID=rownames(downdt)
yy <- as.data.frame(reactomePATHNAME2ID)
downdt=dplyr::left_join(downdt,yy,by="DB_ID")
downdt=na.omit(downdt[!duplicated(downdt$path_name),])
rownames(downdt)=downdt$path_name
datatable(format(downdt,digits=3),options = list(
searching = TRUE, rownames= FALSE,
pageLength = 5,
lengthMenu = c(5, 10, 15, 20),
width=8
))both = rbind(downdt, updt)
selectionn=both%>%
filter(p < 0.05,set.size>200,FDR<0.05)
colnames(selectionn)[1]="stat.mean"
datatable(format(selectionn,digits=3),options = list(
searching = TRUE, rownames= FALSE,
pageLength = 5,
lengthMenu = c(5, 10, 15, 20),
width=8
),caption = "restricted table to p < 0.05,set.size>200,FDR<0.05")library(ggrepel )
ggplot(selectionn, aes(x = stat.mean, y = p)) + geom_point( aes(color = set.size), size = 3) + labs(x = "Enrichment score", y = "p", title = "GSEA for significantly downregulated and upregulated pathways") +
geom_text_repel(aes(label=path_name), size=3) +
scale_colour_gradient(high = "red", low = "blue") +
scale_x_continuous(
labels = scales::number_format(accuracy = 0.01))dgel <- DGEList(counts=expressiondt, group=factor(sampleInfo$Diet))
dgel2 <- calcNormFactors(dgel)
groupname<-as.factor(sampleInfo$Diet)
design <- model.matrix(~ groupname + 0)
y <- voom(dgel2, design)
fit <- lmFit(y, design)
cont.matrix <- makeContrasts(contrasts = c("groupnameGlucose-groupnameChow", "groupnameSucrose-groupnameChow","groupnameSucrose-groupnameGlucose"),levels=design)
fit2 <- contrasts.fit(fit, cont.matrix)
fit2 <- eBayes(fit2)
limma.res=topTable(fit2,n=Inf,sort="p", adjust.method = "BH",coef = 3)
datatable(format(limma.res,digits=3),options = list(
searching = TRUE, rownames= FALSE,
pageLength = 5,
lengthMenu = c(5, 10, 15, 20),
width=8),caption = "full table")datatable(format(limma.res,digits=3)[limma.res$adj.P.Val<0.05,],options = list(
searching = TRUE, rownames= FALSE,
pageLength = 5,
lengthMenu = c(5, 10, 15, 20),
width=8),caption = "adjustedpval<0.05")# #filter average expression>7
# limma.res=limma.res[limma.res$AveExpr>7,]
# datatable(limma.res,options = list(
# searching = TRUE, rownames= FALSE,
# pageLength = 5,
# lengthMenu = c(5, 10, 15, 20),
# width=8))
dt=limma.res
dt$name=rownames(dt)
p1 <- ggplot(dt, aes(x = AveExpr, y = logFC,label=name))+
geom_point(aes(colour=-log10(P.Value)), alpha=1/3, size=1) +
scale_colour_gradient(low="blue",high="red")+
ylab("log2 fold change") + xlab("AveExpr")+ggtitle("GlucoseVsChow")
p2 <- ggplot(dt, aes(logFC,-log10(P.Value),label=name))+
geom_point(aes(colour=-log10(P.Value)), alpha=1/3, size=1) +
scale_colour_gradient(low="blue",high="red")+
xlab("log2 fold change") + ylab("-log10 p-value")+ggtitle("GlucoseVsChow")
ggarrange(p1,p2, ncol=2, nrow=1)suppressPackageStartupMessages(library(org.Mm.eg.db))
limma.res$entrez = mapIds(org.Mm.eg.db,
keys=row.names(limma.res),
column="ENTREZID",
keytype="SYMBOL",
multiVals="first")
fc <- limma.res$logFC
names(fc) <- limma.res$entrez
require(gage)
library("reactome.db")
xx <- as.list(reactomePATHID2EXTID)
#keys <- c("100034361", "100037258")
#select(reactome.db, limma.res$entrez, c("PATHNAME"), 'ENTREZID')
fc.kegg.p <- gage(fc, gsets = xx, ref = NULL, samp = NULL, rank.test = TRUE)
updt=as.data.frame(fc.kegg.p[[1]])[,-c(1,6)]
colnames(updt)=c("enrichment score","p","FDR","set.size")
updt$DB_ID=rownames(updt)
yy <- as.data.frame(reactomePATHNAME2ID)
updt=dplyr::left_join(updt,yy,by="DB_ID")
updt=na.omit(updt[!duplicated(updt$path_name),])
rownames(updt)=updt$path_name
datatable(format(updt,digits=3),options = list(
searching = TRUE, rownames= FALSE,
pageLength = 5,
lengthMenu = c(5, 10, 15, 20),
width=8
))downdt=as.data.frame(fc.kegg.p[[2]])[,-c(1,6)]
colnames(downdt)=c("enrichment score","p","FDR","set.size")
downdt$DB_ID=rownames(downdt)
yy <- as.data.frame(reactomePATHNAME2ID)
downdt=dplyr::left_join(downdt,yy,by="DB_ID")
downdt=na.omit(downdt[!duplicated(downdt$path_name),])
rownames(downdt)=downdt$path_name
datatable(format(downdt,digits=3),options = list(
searching = TRUE, rownames= FALSE,
pageLength = 5,
lengthMenu = c(5, 10, 15, 20),
width=8
))both = rbind(downdt, updt)
selectionn=both%>%
filter(p < 0.05,set.size>200,FDR<0.05)
colnames(selectionn)[1]="stat.mean"
datatable(format(selectionn,digits=3),options = list(
searching = TRUE, rownames= FALSE,
pageLength = 5,
lengthMenu = c(5, 10, 15, 20),
width=8
),caption = "restricted table to p < 0.05,set.size>200,FDR<0.05")library(ggrepel )
ggplot(selectionn, aes(x = stat.mean, y = p)) + geom_point( aes(color = set.size), size = 3) + labs(x = "Enrichment score", y = "p", title = "GSEA for significantly downregulated and upregulated pathways") +
geom_text_repel(aes(label=path_name), size=3) +
scale_colour_gradient(high = "red", low = "blue") +
scale_x_continuous(
labels = scales::number_format(accuracy = 0.01))